library(data.table)
library(magrittr)
library(tidytext)

# Read the scraped corpus; the rest of the script treats it as a data.table
# with (at least) the columns type, subtype, text and address.
texts <- readRDS(file = "data/inputs/chavez_discourse.rds")

##########################################
#
#
#
# FIX LABELS AND ENCODING
#
#
#
##########################################
# Remove texts with encoding issues (16 in total). The literals below are the
# corrupted labels exactly as they appear in the data; do not "repair" them.
texts <- texts[type != "AlÃ³ Presidente"]
bad_encoding_subtypes <- c(
  "Hugo ChÃ¡vez Candidato",
  "Conversaciones TelefÃ³nicas",
  "SalÃ³n Ayacucho, Palacio de Miraflores, Parroquia Catedral, Municipio Libertador, Caracas, Distrito Capital, Venezuela"
)
texts <- texts[!(subtype %in% bad_encoding_subtypes)]

# Fix misassigned labels (7 in total): for these texts a time of day ended up
# in `type`. Each time label is mapped to the type it should carry and the
# subtype is blanked.
# NOTE(review): "5:00 PM" used to be listed under both "Discursos y
# Alocuciones" and "Entrevistas y Declaraciones". The first rule ran first, so
# every "5:00 PM" text became "Discursos y Alocuciones" and the second rule
# never matched. That outcome is kept here; if two different texts share this
# label, one of them is still mislabelled and needs another key (e.g. title or
# date) to be told apart.
type_by_time_label <- c(
  "5:00 PM" = "Discursos y Alocuciones",
  "1:40 PM" = "Encuentros y Coloquios",
  "12:34 AM" = "Encuentros y Coloquios",
  "3:45 PM" = "Entrevistas y Declaraciones",
  "2:00 AM - 4:00 AM" = "Entrevistas y Declaraciones",
  "10:25 AM" = "Escritos"
)
# Both right-hand sides are evaluated on the original `type` before either
# column is overwritten. as.character() guards against `type` being a factor,
# which would otherwise index the lookup by integer code.
texts[
  type %in% names(type_by_time_label),
  `:=`(
    subtype = NA,
    type = unname(type_by_time_label[as.character(type)])
  )
]

# Keep "Escritos" only when the subtype is one of the known categories.
# NOTE(review): the "10:25 AM" text relabelled to "Escritos" above has an NA
# subtype, so this filter removes it again -- confirm that is intended.
escritos_subtypes <- c(
  "Líneas de Chávez", "Documentos", "Cartas y Mensajes", "Textos Diversos",
  "Twitter @chavezcandanga", "Libros", "Programas en Vivo"
)
texts <- texts[!(type == "Escritos" & !(subtype %in% escritos_subtypes))]

# Remove non-texts (these entries are links to books and documents).
texts <- texts[!(subtype %in% c("Documentos", "Libros"))]

# Correct misassigned columns for tweets: their content sits in `address`, so
# move it to `text` and clear `address`. (Easier to do here than to write a
# scraper specific to tweets.)
texts[
  subtype == "Twitter @chavezcandanga",
  `:=`(text = address, address = NA)
]

##########################################
#
#
#
# CLEAN TEXT
#
#
#
##########################################
# Normalise a character vector of transcripts and return it cleaned.
# Every step is vectorised, so the whole column is processed in one call.
# Order matters: accents are stripped before "Alo Presidente" is matched, and
# punctuation/digits become spaces before short words are dropped.
# NA inputs stay NA.
clean_text <- function(x) {
  x <- gsub("\\[.*?\\]", " ", x)          # drop text in [], descriptive (e.g. [risas])
  x <- gsub("\\(.*?\\)", " ", x)          # drop text in (), descriptive (e.g. (risas))
  x <- chartr(                            # replace accented characters (incl. Ü)
    "àèìòùáéíóúüñÀÈÌÒÙÁÉÍÓÚÜÑ",
    "aeiouaeiouunAEIOUAEIOUUN",
    x
  )
  x <- gsub("[^[:alpha:]]", " ", x)       # replace non-alpha characters
  x <- gsub("Alo Presidente", " ", x)     # remove references to the show
  x <- gsub("\\b\\w{1,3}\\b", " ", x)     # remove 1-3 letter words
  x <- gsub("^ +| +$|( ) +", "\\1", x)    # trim and collapse runs of spaces
  tolower(x)                              # lowercase
}

texts[, text := clean_text(text)]

# Remove empty texts: both missing ones and those reduced to "" by cleaning.
texts <- texts[!is.na(text) & nzchar(text)]

##########################################
#
#
#
# EXPLORE BASIC COUNTS
#
#
#
##########################################
# Number of texts per type, then per type/subtype combination.
texts[, .(N = .N), by = "type"]
texts[, .(N = .N), by = c("type", "subtype")]

# Write the preprocessed corpus to disk.
saveRDS(object = texts, file = "data/outputs/chavez_discourse_preprocessed.rds")










